#####
#This is the main file to replicate the tables used in the article TITLE
#This makes most of the tables and graphs used
#To make the main replication of WHC's study as close as possible to theirs, Stata is used for table1_rep.tex and table2_rep.tex
#to make these models, run whc_rep.do after running this file
#Author: Richard J. McAlexander
#email: richardmcalexander@gmail.com
#date: Date
#RStudio Version: 1.2.1335
#clear everything
rm(list=ls())

#LOAD PACKAGES
library(tidyverse)
library(readr)
library(readxl)
library(miceadds)
library(margins)
library(interflex)
library(stargazer)
library(effects)

#IMPORT DATA
WHC_replication_panel <- read_csv("WHC_replication_panel.csv")
WHC_replication <- read_delim("WHC_replication.tab", "\t", escape_double = FALSE, trim_ws = TRUE) 
lange_data <- suppressWarnings(read_excel("lange_data.xlsx", 
                                          col_types = c("text", "numeric", "numeric")))

#MERGE LANGE'S MEASURE OF INDIRECT RULE WITH WHC'S DATA
WHC_replication <- merge(WHC_replication,lange_data,all.x=TRUE,by.x = "countryname", by.y = "country")

#flip custom_court to measure size of indirect rule
WHC_replication$indirect_rule <- WHC_replication$custom_court/100

#make interaction column for indirect rule * coastal distance
WHC_replication$coastdist_indirect <- WHC_replication$coastdist*WHC_replication$indirect_rule
WHC_replication$coast_police <- WHC_replication$coastdist*WHC_replication$police

#FILTER TO INCLUDE ONLY UNITS IN SAMPLE
WHC_replication <- WHC_replication[WHC_replication$sample1==1,]

#MAKE SEPARATE DATAFRAME USING ONLY BRITISH COLONIES
WHC_replication_british_only <- filter(WHC_replication, britishcol_flag == 1)


# MAKE THE MAIN TABLE USED BY WHC -------------------------------------------------------------
#table 1
#(1) center periphery
table1_1 = (glm(status_egip ~   coastdist +indirect_rule+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol , data=WHC_replication,family = binomial(link = "probit")))
#(2) differential legacies
table1_2 = (glm(status_egip ~  coastdist+coastdist_indirect+indirect_rule+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol, data=WHC_replication,family = binomial(link = "probit")))

#THIS USES THE ALTERNATE MEASURES USING LANGE'S INDIRECT RULE
#(1) center periphery
table1_1_pol = (glm(status_egip ~   coastdist +police+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol , data=na.omit(dplyr::select(WHC_replication,status_egip,coastdist,police,groupsize,lnarea,lncarea,lnpop,lngdp,indep_viol)),family = binomial(link = "probit")))
#(2) differential legacies
table1_2_pol = (glm(status_egip ~  coastdist*police+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol, data=na.omit(dplyr::select(WHC_replication,status_egip,coastdist,police,groupsize,lnarea,lncarea,lnpop,lngdp,indep_viol)),family = binomial(link = "probit")))


#MAKE CLUSTER STANDARD ERRORS
cluster_WHC_rep <- na.omit(dplyr::select(WHC_replication,status_egip,coastdist,police,groupsize,lnarea,lncarea,lnpop,lngdp,indep_viol,countryname))
table1_1_pol_se <- miceadds::glm.cluster( data=cluster_WHC_rep, formula=status_egip ~   coastdist+police+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol,cluster="countryname", family=binomial(link = "probit"))
table1_1_pol_se_summary <- summary(table1_1_pol_se)

table1_2_pol_se <- miceadds::glm.cluster( data=cluster_WHC_rep, formula=status_egip ~ coastdist*police+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol,cluster="countryname", family=binomial(link = "probit"))
table1_2_pol_se_summary <- summary(table1_2_pol_se)

#THIS CREATES TABLE 1 IN THE APPENDIX
sink("table1_rep_police.tex")
stargazer(header = F,
          list(table1_1_pol,table1_2_pol),
          omit=c("factor","Constant"),
          no.space = T,
          covariate.labels = c("ln Distance to Coast","Colonial Police","Group Size", "ln Group Area", "ln Country Area", "ln Population", "ln GDP p.c.", "Violent Independence","ln Distance to Coast x \n Colonial Police"),
          #add.lines = list(c("District Fixed Effects?", "No", "No","No","No","Yes")),
          #omit.table.layout = "n",
          se = list(table1_1_pol_se_summary[,2],table1_2_pol_se_summary[,2]),
          suppress.errors = F,float = F,
          font.size="small",
          #star.cutoffs = c(0.05, 0.01, 0.001),
          omit.stat = c("AIC","ll"),
          #float.env = "sidewaystable",
          dep.var.labels =c("Ethnic Inclusion")
)
sink()

#MAKE TABLE 2 IN THE APPENDIX
putterman_data <- read_csv("putterman_data.csv", 
                           col_types = cols(Statext0_1950_2000 = col_number(), 
                                            Statext1_1901_1950 = col_number(), 
                                            `Statext2_1851-1900` = col_number())) %>% na.omit()

putterman_data$cowid <- countrycode::countrycode(putterman_data$wbcode_iso3c,"iso3c","cown")
colnames(putterman_data)[5] <-"stateness"
WHC_replication$cowid <- as.integer(WHC_replication$cowid)
WHC_replication_putterman <- merge(WHC_replication,putterman_data[,5:6], by = "cowid",all.x=TRUE)
WHC_replication_putterman <- WHC_replication_putterman[!is.na(WHC_replication_putterman$cowid),]


putterman1 <- (glm(status_egip ~  stateness+coastdist+indirect_rule+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol, data=WHC_replication_putterman,family = binomial(link = "probit")))
putterman2 <- (glm(status_egip ~  stateness+coastdist*indirect_rule+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol, data=WHC_replication_putterman,family = binomial(link = "probit")))


sink("putterman_models.tex")
stargazer(header = F,
          list(putterman1,putterman2),
          omit=c("factor","Constant"),
          no.space = T,
          covariate.labels = c("Pre-colonial Stateness","ln Distance to Coast","Indirect Rule","Group Size", "ln Group Area", "ln Country Area", "ln Population", "ln GDP p.c.", "Violent Independence","ln Distance to Coast x \n Indirect Rule"),
          #add.lines = list(c("District Fixed Effects?", "No", "No","No","No","Yes")),
          #omit.table.layout = "n",
          suppress.errors = F,float = F,
          font.size="small",
          #star.cutoffs = c(0.05, 0.01, 0.001),
          omit.stat = c("AIC","ll"),
          #float.env = "sidewaystable",
          dep.var.labels =c("Ethnic Inclusion")
)
sink()

#MAKE PLOT OF PREDICTED PROBABILITIES
#figure 1 in the appendix
inter.kernel(Y = "status_egip", D = "coastdist", X = "indirect_rule",Z=c("groupsize","lnarea","lncarea","lnpop","lngdp","indep_viol"), data = na.omit(dplyr::select(WHC_replication,status_egip,coastdist,indirect_rule,groupsize,lnarea,lncarea,lnpop,lngdp,indep_viol,countryname)),na.rm = TRUE, parallel = TRUE, cores = 4,main="Marginal Effect",Xlabel="Indirect Rule",Ylabel = "Ethnic Inclusion",Dlabel="Distance to Coast")
ggsave("interflex_plot.pdf",height=8,width=8,units="in",device = cairo_pdf)

#figure 2 in the appendix
pp <- (glm(status_egip ~  coastdist*indirect_rule+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol, data=WHC_replication,family = binomial(link = "probit")))
effect_plot <- effects::effect("coastdist*indirect_rule", pp)

pdf("probit_interaction_plot.pdf",height=8,width=10) 
plot(effect_plot, axes=list(
  y=list(lab="Prob(Inclusion)")),
  xlab="Distance to Coast",main = "Distance to Coast & Indirect Rule Plot")
dev.off()

####FIX BEFORE SUBMITTING
####USE HARIRI'S IMPUTED DATA AND MAKE TABLES
load("hariri_data.RData") 

table <- dplyr::select(table,country,indirect_rule) %>% unique() %>% na.omit()
colnames(table) <- c("countryname","indirect_rule_predicted")

WHC_replication <- merge(WHC_replication,table,all.x=TRUE)
WHC_replication$indirect_rule_imputed <- WHC_replication$indirect_rule
WHC_replication$indirect_rule_imputed[is.na(WHC_replication$indirect_rule)] <- WHC_replication$indirect_rule_predicted[is.na(WHC_replication$indirect_rule)]


imputed_1 <- (glm(status_egip ~   coastdist +indirect_rule_imputed+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol , data=WHC_replication,family = binomial(link = "probit")))

imputed_2 <- (glm(status_egip ~  coastdist*indirect_rule_imputed+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol, data=WHC_replication,family = binomial(link = "probit")))


imputed_df <- na.omit(dplyr::select(WHC_replication,indirect_rule_imputed,status_egip,coastdist,police,groupsize,lnarea,lncarea,lnpop,lngdp,indep_viol,countryname))
imputed_1_se <- miceadds::glm.cluster( data=imputed_df, formula=status_egip ~coastdist +indirect_rule_imputed+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol,cluster="countryname", family=binomial(link = "probit"))
imputed_1_se_summary <- summary(imputed_1_se)

imputed_2_se <- miceadds::glm.cluster( data=imputed_df, formula=status_egip ~coastdist*indirect_rule_imputed+ groupsize + lnarea + lncarea + lnpop + lngdp + indep_viol,cluster="countryname", family=binomial(link = "probit"))
imputed_2_se_summary <- summary(imputed_2_se)

#
sink("table_imputed.tex")
stargazer(header = F,
          list(imputed_1,imputed_2),
          omit=c("factor","Constant"),
          no.space = T,
          covariate.labels = c("ln Distance to Coast","Indirect Rule (Imputed)","Group Size", "ln Group Area", "ln Country Area", "ln Population", "ln GDP p.c.", "Violent Independence","ln Distance to Coast x \n Indirect Rule (Imputed)"),
          #add.lines = list(c("District Fixed Effects?", "No", "No","No","No","Yes")),
          #omit.table.layout = "n",
          se = list(imputed_1_se_summary[,2],imputed_2_se_summary[,2]),
          suppress.errors = F,float = F,
          font.size="small",
          #star.cutoffs = c(0.05, 0.01, 0.001),
          omit.stat = c("AIC","ll"),
          #float.env = "sidewaystable",
          dep.var.labels =c("Ethnic Inclusion")
)
sink()

#MAKE INTERACTION TERM FOR IMPUTED INDIRECT RULE AND COASTDIST
WHC_replication$coastdist_indirect_imputed <- WHC_replication$coastdist*WHC_replication$indirect_rule_imputed

#WRITE THE FILE FOR USE WITH STATA
write.csv(WHC_replication_british_only,file="WHC_replication_british_only.csv")

